<!DOCTYPE html>
<html>
<head><meta name="generator" content="Hexo 3.9.0">
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  
  <title>NLP中的注意力机制简介（二） | Rogerspy&#39;s Home</title>
  
  <meta name="keywords" content="Machine Learning, Deep Learning, NLP">
  
  

  
  <link rel="alternate" href="/atom.xml" title="Rogerspy's Home">
  

  <meta name="HandheldFriendly" content="True">
  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
  <!-- meta -->
  
  
  <meta name="theme-color" content="#FFFFFF">
  <meta name="msapplication-TileColor" content="#1BC3FB">
  <meta name="msapplication-config" content="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicons/browserconfig.xml">
  

  <!-- link -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/fancyapps/fancybox@3.5.7/dist/jquery.fancybox.min.css">
  
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/node-waves@0.7.6/dist/waves.min.css">
  
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@5.10.1/css/all.min.css">
  
  
  <link rel="shortcut icon" type="image/x-icon" href="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicon.ico">
  <link rel="icon" type="image/x-icon" sizes="32x32" href="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicons/favicon-32x32.png">
  <link rel="apple-touch-icon" type="image/png" sizes="180x180" href="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicons/apple-touch-icon.png">
  <link rel="mask-icon" color="#1BC3FB" href="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicons/safari-pinned-tab.svg">
  <link rel="manifest" href="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicons/site.webmanifest">
  

  

  
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/xaoxuu/cdn-material-x@19.5/css/style.css">
  

  <script>
    function setLoadingBarProgress(num) {
      document.getElementById('loading-bar').style.width=num+"%";
    }
  </script>
  

  
  
  <!-- 时间线 -->
  <link rel="stylesheet" href="/css/timeline.css">
  <!-- 血小板-->
  <link rel="stylesheet" href="/live2d/css/live2d.css">
  <style>
	.article p .mjx-math {
	    font-family: Menlo,Monaco,courier,monospace,"Lucida Console",'Source Code Pro',"Microsoft YaHei",Helvetica,Arial,sans-serif,Ubuntu;
        background: none;
        padding: 2px;
        border-radius: 4px;
	}
  </style>
</head>

<body>
  
  
  <div class="cover-wrapper">
    <cover class='cover post half'>
      
        
  <h1 class='title'>Rogerspy's Home</h1>


  <div class="m_search">
    <form name="searchform" class="form u-search-form">
      <input type="text" class="input u-search-input" placeholder="" />
      <i class="icon fas fa-search fa-fw"></i>
    </form>
  </div>

<div class='menu navgation'>
  <ul class='h-list'>
    
      
        <li>
          <a class="nav home" href="/"
            
            
            id="home">
            <i class='fas fa-edit fa-fw'></i>&nbsp;博文
          </a>
        </li>
      
        <li>
          <a class="nav home" href="/video/"
            
            
            id="video">
            <i class='fas fa-film fa-fw'></i>&nbsp;视频
          </a>
        </li>
      
        <li>
          <a class="nav home" href="/material/"
            
              rel="nofollow"
            
            
            id="material">
            <i class='fas fa-briefcase fa-fw'></i>&nbsp;资料
          </a>
        </li>
      
        <li>
          <a class="nav home" href="/about/"
            
              rel="nofollow"
            
            
            id="about">
            <i class='fas fa-info-circle fa-fw'></i>&nbsp;关于
          </a>
        </li>
      
    
  </ul>
</div>

      
    </cover>
    <header class="l_header pure">
  <div id="loading-bar-wrapper">
    <div id="loading-bar" class="pure"></div>
  </div>

	<div class='wrapper'>
		<div class="nav-main container container--flex">
      <a class="logo flat-box" href='/' >
        
          Rogerspy's Home
        
      </a>
			<div class='menu navgation'>
				<ul class='h-list'>
          
  					
  						<li>
								<a class="nav flat-box" href="/blog/"
                  
                  
                  id="blog">
									<i class='fas fa-edit fa-fw'></i>&nbsp;博客
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/video/"
                  
                  
                  id="video">
									<i class='fas fa-film fa-fw'></i>&nbsp;视频小站
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/material/"
                  
                  
                  id="material">
									<i class='fas fa-briefcase fa-fw'></i>&nbsp;学习资料
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/diary/"
                  
                  
                  id="diary">
									<i class='fas fa-book fa-fw'></i>&nbsp;随心记
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/categories/"
                  
                    rel="nofollow"
                  
                  
                  id="categories">
									<i class='fas fa-folder-open fa-fw'></i>&nbsp;分类
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/tags/"
                  
                    rel="nofollow"
                  
                  
                  id="tags">
									<i class='fas fa-hashtag fa-fw'></i>&nbsp;标签
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/blog/archives/"
                  
                    rel="nofollow"
                  
                  
                  id="blogarchives">
									<i class='fas fa-archive fa-fw'></i>&nbsp;归档
								</a>
							</li>
      			
      		
				</ul>
			</div>

			
				<div class="m_search">
					<form name="searchform" class="form u-search-form">
						<input type="text" class="input u-search-input" placeholder="搜索" />
						<i class="icon fas fa-search fa-fw"></i>
					</form>
				</div>
			
			<ul class='switcher h-list'>
				
					<li class='s-search'><a class="fas fa-search fa-fw" href='javascript:void(0)'></a></li>
				
				<li class='s-menu'><a class="fas fa-bars fa-fw" href='javascript:void(0)'></a></li>
			</ul>
		</div>

		<div class='nav-sub container container--flex'>
			<a class="logo flat-box"></a>
			<ul class='switcher h-list'>
				<li class='s-comment'><a class="flat-btn fas fa-comments fa-fw" href='javascript:void(0)'></a></li>
        
          <li class='s-toc'><a class="flat-btn fas fa-list fa-fw" href='javascript:void(0)'></a></li>
        
			</ul>
		</div>
	</div>
</header>
	<aside class="menu-phone">
    <header>
		<nav class="menu navgation">
      <ul>
        
          
            <li>
							<a class="nav flat-box" href="/"
                
                
                id="home">
								<i class='fas fa-clock fa-fw'></i>&nbsp;近期文章
							</a>
            </li>
          
            <li>
							<a class="nav flat-box" href="/blog/archives/"
                
                  rel="nofollow"
                
                
                id="blogarchives">
								<i class='fas fa-archive fa-fw'></i>&nbsp;文章归档
							</a>
            </li>
          
            <li>
							<a class="nav flat-box" href="/blog/"
                
                
                id="blog">
								<i class='fas fa-edit fa-fw'></i>&nbsp;我的博客
							</a>
            </li>
          
            <li>
							<a class="nav flat-box" href="/video/"
                
                  rel="nofollow"
                
                
                id="video">
								<i class='fas fa-film fa-fw'></i>&nbsp;我的视频
							</a>
            </li>
          
            <li>
							<a class="nav flat-box" href="/material/"
                
                  rel="nofollow"
                
                
                id="material">
								<i class='fas fa-briefcase fa-fw'></i>&nbsp;学习资料
							</a>
            </li>
          
            <li>
							<a class="nav flat-box" href="/about/"
                
                  rel="nofollow"
                
                
                id="about">
								<i class='fas fa-info-circle fa-fw'></i>&nbsp;关于小站
							</a>
            </li>
          
       
      </ul>
		</nav>
    </header>
	</aside>
<script>setLoadingBarProgress(40);</script>

  </div>


  <div class="l_body">
    <div class='body-wrapper'>
      <div class='l_main'>
  

  
    <article id="post" class="post white-box article-type-post" itemscope itemprop="blogPost">
      


  <section class='meta'>
    
    
    <div class="meta" id="header-meta">
      
        
  
    <h1 class="title">
      <a href="/2019/08/27/NLP中的注意力机制简介（二）/">
        NLP中的注意力机制简介（二）
      </a>
    </h1>
  


      
      <div class='new-meta-box'>
        
          
        
          
            
  <div class='new-meta-item author'>
    <a href="https://rogerspy.gitee.io" rel="nofollow">
      
        <i class="fas fa-user" aria-hidden="true"></i>
      
      <p>Rogerspy</p>
    </a>
  </div>


          
        
          
            <div class="new-meta-item date">
  <a class='notlink'>
    <i class="fas fa-calendar-alt" aria-hidden="true"></i>
    <p>2019-08-27</p>
  </a>
</div>

          
        
          
            
  
  <div class='new-meta-item category'>
    <a href='/categories/nlp/' rel="nofollow">
      <i class="fas fa-folder-open" aria-hidden="true"></i>
      <p>NLP</p>
    </a>
  </div>


          
        
          
            
  
    <div class="new-meta-item browse busuanzi">
      <a class='notlink'>
        <i class="fas fa-eye" aria-hidden="true"></i>
        <p>
          <span id="busuanzi_value_page_pv">
            <i class="fas fa-spinner fa-spin fa-fw" aria-hidden="true"></i>
          </span>
        </p>
      </a>
    </div>
  


          
        
          
            

          
        
          
            
  
    <div style="margin-right: 10px;">
      <span class="post-time">
        <span class="post-meta-item-icon">
          <i class="fa fa-keyboard"></i>
          <span class="post-meta-item-text">  字数统计: </span>
          <span class="post-count">5.6k字</span>
        </span>
      </span>
      &nbsp; | &nbsp;
      <span class="post-time">
        <span class="post-meta-item-icon">
          <i class="fa fa-hourglass-half"></i>
          <span class="post-meta-item-text">  阅读时长≈</span>
          <span class="post-count">23分</span>
        </span>
      </span>
    </div>
  

          
        
      </div>
      
        <hr>
      
    </div>
  </section>


      <section class="article typo">
        <div class="article-entry" itemprop="articleBody">
          <h1>
    <small>——Transformer专题篇</small>
</h1>



<h1 id="1-前言"><a href="#1-前言" class="headerlink" title="1. 前言"></a>1. 前言</h1><p>之前我们介绍了各种各样的注意力机制，如果仔细回想一下就可以发现无论是哪种注意力机制都不是单独出现的，都是伴随着<em>RNN</em>或者其他<em>RNN</em>的变种。这种基于<em>RNN</em>的注意力机制会面临一个问题就是，难以处理长序列的句子，因为无法实现并行计算，所以非常消耗计算资源。</p>
<a id="more"></a>
<p><em>CNN</em>虽然可以实现并行计算，但是它无法获得序列的位置信息，这样它也就难以获得远距离信息上的依赖关系。后来虽然有人提出了完全基于<em>CNN</em>的<em>seq2seq</em>模型，但是却非常消耗内存。</p>
<p>由于基于<em>RNN</em>的注意力机制遇到了计算资源上的瓶颈，<a href="https://arxiv.org/pdf/1706.03762.pdf" target="_blank" rel="noopener">Vaswani et al., 2017</a>提出了一个新的模型——<strong>Transformer！</strong>从目前的发展来看，这个模型对得起这个名字，因为它真的很能打，自从2018年基于<em>Transformer</em>的<em>BERT</em>预训练语言模型的横空出世到今天，几乎每一次<em>NLP</em>的重大进展都与它息息相关。因此，我们专门开一个专题篇来详细介绍一下这个模型。</p>
<p><strong>Transformer</strong>的创新点在于抛弃了之前传统的基于<em>CNN</em>或者<em>RNN</em>的<em>encoder-decoder</em>模型的固有模式，只用<em>Attention</em>实现<em>encoder-decoder</em>。<em>Transformer</em>的主要目的在于减少计算量和提高并行效率的同时不损害最终的实验结果。</p>
<h1 id="2-模型结构"><a href="#2-模型结构" class="headerlink" title="2. 模型结构"></a>2. 模型结构</h1><h2 id="2-1-模型结构总览"><a href="#2-1-模型结构总览" class="headerlink" title="2.1 模型结构总览"></a>2.1 模型结构总览</h2><p>在初始的论文中<em>Transformer</em>仍然是被用于机器翻译任务上：</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/the_transformer_3.png" alt="img"></p>
<p>下面我们来打开擎天柱，看下它到底是怎么构成的？</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/The_transformer_encoders_decoders.png" alt="img"></p>
<p>可以看到<em>Transformer</em>仍然采用了<em>encoder-decoder</em>结构，原始论文中<em>encoder</em>是由6个相同的编码模块堆叠而成（这里的相同指的是结构相同，但是其中的权重却是不共享的， 下面的解码器与之相同），而<em>decoder</em>同样也是用6个解码器堆叠而成的。<strong>6</strong>这个数字并没有什么特殊之处，只是原始论文使用的层数，我们可以在实验过程中任意设置层数。如下图所示：</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/The_transformer_encoder_decoder_stack.png" alt="img"></p>
<p>注意一个细节，<em>encoder</em>和<em>decoder</em>的链接方式是<em>encoder</em>的最后一层输出与<em>decoder</em>的每一层相连。下面我们打开其中一个编码器和解码器看下里面是什么结构：</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/Transformer_decoder.png" alt="img"></p>
<p><em>encoder</em>的输入先经过一个自注意力层对句子进行编码获取词的注意力权重，然后将自注意力输入给一个全连接层。</p>
<p>对于<em>decoder</em>中自注意力层和全连接层和<em>encoder</em>相同，但是在自注意力输出后要经过一个注意力编码层与<em>encoder</em>进行联合编码，然后再传入给全连接层。这个联合编码其实就类似于在<em>seq2seq</em>模型中解码过程中，<em>decoder</em>隐状态和注意力联合编码然后再输出的过程是类似的。</p>
<p>下面我们继续拆解擎天柱的零件，看看这个自注意力和全连接层下面埋藏着什么秘密。</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/1566026177834.png" alt></p>
<p>原来所谓的<em>Self-attention</em>是一个叫做<em>Multi-Head Attention</em>的东西，这个就是擎天柱的核心部件了。其他的小零件比如<em>Add, Norm, Linear, softmax, Feed Forward</em>等等都是在五金店都能买到的小玩意儿。下面我们就详细看下这个能量块到底隐藏着什么秘密吧。</p>
<h2 id="2-2-Multi-Head-Attention"><a href="#2-2-Multi-Head-Attention" class="headerlink" title="2.2 Multi-Head Attention"></a>2.2 Multi-Head Attention</h2><p>从字面上就可以看出所谓<em>Multi-Head Attention</em>是由两部分组成——<em>Multi-Head</em>和<em>Attention</em>，而实际情况也是如此。其实严格来讲这是一个一般化的名称，如果具体到论文使用的自注意力机制的话，应该叫做<em>Self-Multi-Head Attention</em>。应该有三部分组成，除了上面提到的两个，还应该加上一个自注意力。</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/e4bc841abc55d366813340f92f6696c5d59e95.png" alt></p>
<p>还记得我们上一篇介绍注意力机制的文章中提到一种使用$\mathbf{k}, \mathbf{q}, \mathbf{v}$计算注意力的注意力机制。而<em>Scaled Dot-Product</em>也是我们之前提到的一种计算<em>Alignment score function</em>的方法。也就是说，图中下半部分是在计算<em>Scaled-Dot Product Attention</em>，而图中上半部分的<em>Concat</em>操作就是拼接操作，拼接什么？仔细看下半部分计算注意力的时候，不是只计算一个<em>Scaled-Dot Product Attention</em>，而是在同时计算<em>h</em>个<em>Scaled-Dot Product Attention</em>。而<em>Concat</em>拼接的的就是这<em>h</em>个<em>Scaled-Dot Product Attention</em>。这就是所谓的<em>Multi-Head Attention</em>，每一个<em>Head</em>就是一个<em>Scaled-Dot Product Attention</em>。</p>
<p>形式化描述如下：</p>
<script type="math/tex; mode=display">
MultiHead(Q, K, V) = Concat(head_1, head_2, ..., head_h)W^O</script><script type="math/tex; mode=display">
head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)</script><p>其中$W_i^Q \in \mathbb{R}^{d_{model}\times d_q}$，$W_i^K \in \mathbb{R}^{d_{model}\times d_k}$，$W_i^V \in \mathbb{R}^{d_{model}\times d_v}$，$W_i^O \in \mathbb{R}^{d_{model}\times d_o}$。在原始论文中$h=8$，$d_k=d_v=d_{model}/h=64$。由于对每一个<em>Head</em>进行了降维，所以总的计算量和使用一个单独的不降维的<em>Head</em>是一样的。本文涉及到的公式所用标记都与原论文保持一致，避免混淆。</p>
<p>我们仔细思考一下这个<em>Multi-Head Attention</em>，和我们提到的<em>Multi-dimensional Attention</em>有异曲同工之妙。论文里提到，相对于使用单个注意力而言，使用<em>Multi-Head</em>能获得更好的效果。但是论文并没有解释为什么。我们这里结合<em>Multi-dimensional Attention</em>做一个大胆的猜想：<em>Transformer</em>的强大之处正是由于这个<em>Multi-Head</em>！因为多维注意力机制能够获得一句话中在不同语境下的不同理解。而在语言模型中，词语和句子的歧义性一直是自然语言处理的难点，而<em>Transformer</em>在多维注意力机制的作用下能够很好的获取句子的多重含义，并且能根据上下文信息自动获取正确的语义，因此<em>Transformer</em>能够在预训练语言模型中大放异彩。</p>
<p>下面我们就应该看一下这个核心中的核心——<em>Scaled-Dot Product Attention</em>了。</p>
<h2 id="2-3-Scaled-Dot-Product-Attention"><a href="#2-3-Scaled-Dot-Product-Attention" class="headerlink" title="2.3 Scaled-Dot Product Attention"></a>2.3 Scaled-Dot Product Attention</h2><p><em>Scaled-Dot Product Attention</em>结构如图所示：</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/1566031501224.png" alt></p>
<p>首先我们先给出其形式化定义：</p>
<script type="math/tex; mode=display">
Attention(K, Q, V) = softmax(\frac{QK^T}{\sqrt {d_k}})V</script><p>我们把图中结构分解开来，一步一步解释清楚：</p>
<ul>
<li>第一步<em>MatMul</em>：计算<em>Alignment score function</em></li>
</ul>
<script type="math/tex; mode=display">
MatMul(Q, K_i) = QK_i^T</script><ul>
<li>第二步<em>Scale</em>：调节<em>Alignment score function</em>分数</li>
</ul>
<script type="math/tex; mode=display">
Scale(Q, K_i) = \frac{Q,K_i^T}{\sqrt{d_k}}</script><ul>
<li>第三步<em>Mask</em>（可选）：<em>encoder</em>不需要这一步，<em>decoder</em>需要这一步。</li>
</ul>
<p>这里的masked就是要在训练语言模型的时候，不给模型看到未来的信息， 让模型自己预测下一个词。</p>
<ul>
<li>第四步<em>Softmax</em>：相似度归一化</li>
</ul>
<script type="math/tex; mode=display">
\alpha_i = Softmax(Q, K_i) =softmax(\frac{QK_i^T}{\sqrt {d_k}})</script><ul>
<li>第五步<em>MatMul</em>：通过计算出来的权重与$V$加权求和得到最终的<em>Attention</em>向量</li>
</ul>
<script type="math/tex; mode=display">
Attention(K_i, Q, V_i) = \sum_i \alpha_i V_i</script><p>下面我们从序列输入开始详细解释一下每一步到底是在做什么。</p>
<ul>
<li>第一步：计算<em>K, Q, V</em></li>
</ul>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/transformer_self_attention_vectors.png" alt="img"></p>
<p>将输入的每一个词转化成词向量，词向量可以是预训练好的（比如用<em>word2vec</em>）词向量，在网络训练过程中固定词向量矩阵不参与训练，也可以是随时初始化，然后随着网络的训练不断更新词向量矩阵。</p>
<p>将序列中每个元素对应的词向量分别与$W^Q, W^K, W^V$相乘计算得到<em>queries, keys, values</em>。计算得到的<em>queries, keys, values</em>的维度为64，当然维度缩减并非必须。</p>
<ul>
<li>第二步：计算自注意力的<em>alignment score function</em></li>
</ul>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/transformer_self_attention_score.png" alt="img"></p>
<p>所谓自注意力就是计算序列中两两元素之前的依赖关系，这个分数表示当前这个词需要把多少注意力放在句子中的其他部分上。</p>
<p>以上图举例，<em>Thinking</em>的得分是$\mathbf{q}_1\cdot \mathbf{k}_1=112$，<em>Machines</em>的得分是$\mathbf{q}_1\cdot \mathbf{k}_2=96$，后面的以此类推。</p>
<ul>
<li>第三步和第四步：对上一步的得分进行缩放，然后计算<em>softmax</em></li>
</ul>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/self-attention_softmax.png" alt="img"></p>
<p>上一步的得分除以8（因为前面我们提到<em>queries, keys, values</em>的维度为64，开方之后就是8）。之所以要做这个缩放论文给出的解释是防止这个得分过大或者过小，在做<em>softmax</em>的时候不是0就是1，这样的话就不够<em>soft</em>了。</p>
<p>得到放缩后的得分之后就是计算<em>softmax</em>了。</p>
<ul>
<li>第五步：在<em>decoder</em>中对句子进行<em>Mask</em>。比如输入是一句话 “i have a dream” 总共4个单词，这里就会形成一张4x4的注意力机制的图： </li>
</ul>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/1.png" alt></p>
<p>这里的<em>mask</em>就是要在做语言建模的时候，不给模型看到未来的信息，让模型自己预测后面的信息。比如上图中，“I”作为第一个词，只能和自己进行<em>Attention</em>；“have”作为第二个词，可以和“I”和“have”本身进行<em>Attention</em>；“a”作为第三个单词，可以和“I”，“have”，“a” 三个单词进行<em>Attention</em>；到了最后一个单词“dream”的时候，才有对整个句子4个单词的<em>attention</em>。</p>
<ul>
<li>第六步：用上面计算出来的<em>softmax</em>与<em>values</em>进行加权求和</li>
</ul>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/self-attention-output.png" alt="img"></p>
<p>至此<em>Scaled-Dot Product Attention</em>就计算完了。</p>
<h2 id="2-4-矩阵计算-Scaled-Dot-Product-Attention"><a href="#2-4-矩阵计算-Scaled-Dot-Product-Attention" class="headerlink" title="2.4 矩阵计算 Scaled-Dot Product Attention"></a>2.4 矩阵计算 Scaled-Dot Product Attention</h2><p>前面我们说过，<em>Transformer</em>的初衷就是并行计算，所谓并行计算就是矩阵计算，上面的例子是通过一个一个向量进行计算的，如果我们把向量堆叠成矩阵，就可以实现并行运算了：</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/self-attention-matrix-calculation.png" width="40%" align="left"><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/self-attention-matrix-calculation-2.png" width="60%" align="right"></p>
<h2 id="2-5-矩阵计算-Multi-Head"><a href="#2-5-矩阵计算-Multi-Head" class="headerlink" title="2.5 矩阵计算 Multi-Head"></a>2.5 矩阵计算 Multi-Head</h2><p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs//transformer_multi-headed_self-attention-recap.png" alt="img"></p>
<h2 id="2-6-Position-Encoding"><a href="#2-6-Position-Encoding" class="headerlink" title="2.6 Position Encoding"></a>2.6 Position Encoding</h2><p>到目前为止，擎天柱的能量核心结构我们介绍完了，但是我们还忽略了一个问题：句子是一个有序的序列，句子中两个词的位置互换的话，这个句子的意思完全不同了，因此在处理自然语言的时候词与词的绝对位置或者相对位置也是一个非常重要的信息。</p>
<p>为了解决位置信息的问题，<em>Transformer</em>在每一个输入向量中增加了一个位置向量，这个位置向量的维度为$d_{model}$，这样输入向量和位置向量就可以 直接相加了。</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/transformer_positional_encoding_example.png" alt="img"></p>
<p>在NLP的很多模型中都有位置向量的使用，比如前面我们提到基于<em>CNN</em>的<em>seq2seq</em>模型（<a href="https://arxiv.org/abs/1705.03122" target="_blank" rel="noopener">Gehring et al., 2017</a>）。但是通常其他模型中的位置向量都是通过学习得来的，本文采用的是直接通过函数构造出来的：</p>
<script type="math/tex; mode=display">
\left\{
\begin{aligned}
PE_{(pos, 2i)} & =  \sin(pos/10000^{2i/d_{model}}) \\\\
PE_{(pos, 2i+1)} & =  \cos(pos/10000^{2i/d_{model}}) 
\end{aligned}
\right.</script><p>其中$pos$表示位置索引，$i$表示维度索引。也就是说位置向量中的每一维都是一个余弦曲线，波长是一个从$2\pi$到$10000 \cdot 2\pi$的等比数列。之所以选择这个函数，是因为它允许模型能很容易的学习到相对位置信息，因为对于任意固定的偏置$k$，$PE_{pos+k}$能通过一个$PE_{pos} $的线性函数推理得到：</p>
<script type="math/tex; mode=display">
\begin{aligned}
\sin(\alpha+\beta) &= \sin(\alpha) \cos(\beta) + \cos(\alpha)\sin(\beta)\\\\
\cos(\alpha+\beta) &= \cos(\alpha) \cos(\beta) - \sin(\alpha)\sin(\beta) 
\end{aligned}</script><p>另外，作者也做过实验，使用通过学习得到的位置向量，最后发现两者的效果差别不大。在效果差别不大的情况下使用直接构造的方法能够避免训练过程的权重更新，这样可以加速训练。另外一个很重要的原因就是，选择这个余弦版本的位置向量还可以处理比训练时遇到的更长的序列。</p>
<h2 id="2-6-残差结构"><a href="#2-6-残差结构" class="headerlink" title="2.6 残差结构"></a>2.6 残差结构</h2><p>如果我们仔细看模型结构图就会发现，数据的流向并不是从一层单项流向下一层的这种简单的串联结构，而是采用了类似残差网络的残差式连接。</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/1566120593103.png" alt></p>
<ul>
<li>第一步：计算<em>Multi-Head Attention</em></li>
<li>第二步：原始的输入+<em>Multi-Head Attention</em></li>
<li>第三步：使用<em>LayerNorm</em>进行正则化</li>
<li>第四步：正则化后的数据经过全连接层，全连接层的激活函数使用<em>ReLU</em>函数。注意这里面的全连接层是每个位置每个位置单独进行计算的，其实更像是卷积核大小为1的卷积层。</li>
<li>第五步：第三步正则化的数据与全连接层后的数据相加</li>
<li>第六步：第五步相加后的数据再次正则化</li>
</ul>
<p>这就是<em>Transformer</em>的残差网络计算过程。</p>
<p>到目前为止，擎天柱身上的主要零部件我们都已经介绍完了，接下来就该把这些零部件再组装回去了。</p>
<h1 id="3-模型组装"><a href="#3-模型组装" class="headerlink" title="3. 模型组装"></a>3. 模型组装</h1><h2 id="3-1-Encoder"><a href="#3-1-Encoder" class="headerlink" title="3.1 Encoder"></a>3.1 Encoder</h2><p><em>Encoder</em>包含6个相同的层</p>
<ul>
<li>每一层包含2个<em>sub-layer</em>：<em>Multi-Head Attention</em>和全连接层。</li>
<li>每个<em>sub-layer</em>都要正则化</li>
<li><em>sub-layer</em>内部通过残差结构连接</li>
<li>每一层的输出维度为$d_{model}=512$</li>
</ul>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/transformer_resideual_layer_norm_2.png" width="60%"></p>
<h2 id="3-2-Decoder"><a href="#3-2-Decoder" class="headerlink" title="3.2 Decoder"></a>3.2 Decoder</h2><p>Decoder也是6层</p>
<ul>
<li>每层包含3个<em>sub-layer</em>：<em>Multi-Head Attention</em>，<em>Encoder-Decoder Attention</em>和全连接层</li>
<li>其中<em>Encoder-Decoder Attention</em>的结构和其他的注意力相同，但是不同的是这一层的$K, V$都是来源于<em>Encoder</em>，而$Q$来源于上一层注意力产生的。</li>
<li><em>Decoder</em>中的<em>Multi-Head Attention</em>层需要进行修改，因为只能获取到当前时刻之前的输入，因此只对时刻 <em>t</em> 之前的时刻输入进行<em>Attention</em>计算，这也称为<em>Mask</em>操作</li>
</ul>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/transformer_resideual_layer_norm_3.png" alt="img"></p>
<h2 id="3-3-最后的Linear层和Softmax层"><a href="#3-3-最后的Linear层和Softmax层" class="headerlink" title="3.3 最后的Linear层和Softmax层"></a>3.3 最后的Linear层和Softmax层</h2><p><em>Decoder</em>输出一个向量，我们怎么把这个向量转化成单词呢？这就是最后的<em>Linear</em>和<em>Softmax</em>层做的事情。</p>
<p>线性变换层是一个简单的全连接层，将<em>Decoder</em>输出的向量转化成一个<em>logits vector</em>。假设模型的词表中有10000个词，这个<em>logits vector</em>的维度就是10000，每一维对应词表中的一个词的得分。</p>
<p>然后<em>softmax</em>将这些得分进行归一化，将得分变成每个词的概率，然后选出概率最大的那个位置对应到词就是最后的输出了。</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/1566125533498.png" width="60%"></p>
<h1 id="4-Transformer在机器翻译中的应用"><a href="#4-Transformer在机器翻译中的应用" class="headerlink" title="4. Transformer在机器翻译中的应用"></a>4. Transformer在机器翻译中的应用</h1><p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/transformer_decoding_1.gif" alt="img"> </p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/transformer_decoding_2.gif" alt="img"></p>
<h1 id="5-关于K-Q-V的讨论"><a href="#5-关于K-Q-V的讨论" class="headerlink" title="5. 关于K, Q, V的讨论"></a>5. 关于K, Q, V的讨论</h1><p>到这里我们关于整个<em>Transformer</em>的介绍就结束了，我们先从整体上介绍了<em>Transformer</em>也是一个基于<em>encoder-decoder</em>的结构，然后抽丝剥茧般的一层一层的剥开模型，看看它的每一部分到底长什么样子，然后我们了解了每个零件之后又重新把每个零件组装回去。但是还是有两个问题我们可以再细致的讨论一下的，比如为什么需要$V$？为什么$K, Q$使用不同的权重获得？</p>
<h2 id="5-1-我们为什么需要V？"><a href="#5-1-我们为什么需要V？" class="headerlink" title="5.1 我们为什么需要V？"></a>5.1 我们为什么需要V？</h2><p>注意力权重矩阵可以表示序列中任意两个元素的相似性，但是不能用来表示原始的序列，因为它缺少了词向量。注意力的作用是给原始序列中的不同位置上的元素以不同的权重，这样可以更好的获取到这个句子中哪一部分重要哪一部分不那么重要，或者说对于句子中的某个词来说，哪个词对它更有依赖关系，哪些词跟它关系没那么密切。所以说，注意力有两个重要的组成部分，一个是注意力权重，也就是词与词之间的相似性，另一个就是原始的句子序列。从模型结构就可以看出来，$K, Q$是用来计算相似性的，那么$V$其实就是用来表征句子序列特征的。</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/1566186141115.png" alt></p>
<p>我们可以认为注意力权重矩阵是一个<em>过滤矩阵</em> ，把更多的注意力给重要的词，给那些不那么重要的词以更少的注意力。</p>
<h2 id="5-2-为什么使用两个不同的权重获得K-Q"><a href="#5-2-为什么使用两个不同的权重获得K-Q" class="headerlink" title="5.2 为什么使用两个不同的权重获得K, Q?"></a>5.2 为什么使用两个不同的权重获得K, Q?</h2><p>另一个问题就是，我们为什么要用两个不同的矩阵来获得$K, Q$？换句话说，就是我们为什么要用两个不同的矩阵来计算注意力呢？</p>
<p>正如我们前面所说的，注意力实际上是在计算两个词的相似度，如果使用相同的矩阵的话那就相当于计算自己与自己的相似度了，最后我们会得到一个对称矩阵，这样最后的模型的泛化性会大打折扣。</p>
<h2 id="5-3-Transformer是如何实现All-You-Need的？"><a href="#5-3-Transformer是如何实现All-You-Need的？" class="headerlink" title="5.3 Transformer是如何实现All You Need的？"></a>5.3 Transformer是如何实现<em>All You Need</em>的？</h2><p>回顾一下前一篇文章，我们介绍了各种各样的注意力机制：</p>
<ul>
<li>用于<em>seq2seq</em>的注意力机制</li>
<li>用于多语义理解的多维注意力机制</li>
<li>用于文本分类和语法纠正的层级注意力机制</li>
<li>用于阅读理解的基于记忆力的注意力机制</li>
<li>用于语言模型的自注意力机制</li>
<li>用于排序的指针网络</li>
<li>其他基于特定任务的注意力机制等等</li>
</ul>
<p>而<em>Transformer</em>本身就是基于<em>encoder-decoder</em>的结构，由于才用了<em>Multi-Head</em>这种类似多维度注意力机制，所以也能在多维度理解语义，另外由于本身是完全基于注意力的网络所以类层级注意力和类指针网络的的特性应该是<em>Transformer</em>的内秉属性。最后重要的两点：自注意力和基于记忆力的注意力机制在<em>Transformer</em>中表现尤为明显。所以说<em>Transformer</em>可以说是注意力机制的集大成者。</p>
<h1 id="6-代码实现（Pytorch）"><a href="#6-代码实现（Pytorch）" class="headerlink" title="6. 代码实现（Pytorch）"></a>6. 代码实现（Pytorch）</h1><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br><span class="line">80</span><br><span class="line">81</span><br><span class="line">82</span><br><span class="line">83</span><br><span class="line">84</span><br><span class="line">85</span><br><span class="line">86</span><br><span class="line">87</span><br><span class="line">88</span><br><span class="line">89</span><br><span class="line">90</span><br><span class="line">91</span><br><span class="line">92</span><br><span class="line">93</span><br><span class="line">94</span><br><span class="line">95</span><br><span class="line">96</span><br><span class="line">97</span><br><span class="line">98</span><br><span class="line">99</span><br><span class="line">100</span><br><span class="line">101</span><br><span class="line">102</span><br><span class="line">103</span><br><span class="line">104</span><br><span class="line">105</span><br><span class="line">106</span><br><span class="line">107</span><br><span class="line">108</span><br><span class="line">109</span><br><span class="line">110</span><br><span class="line">111</span><br><span class="line">112</span><br><span class="line">113</span><br><span class="line">114</span><br><span class="line">115</span><br><span class="line">116</span><br><span class="line">117</span><br><span class="line">118</span><br><span class="line">119</span><br><span class="line">120</span><br><span class="line">121</span><br><span class="line">122</span><br><span class="line">123</span><br><span class="line">124</span><br><span class="line">125</span><br><span class="line">126</span><br><span class="line">127</span><br><span class="line">128</span><br><span class="line">129</span><br><span class="line">130</span><br><span class="line">131</span><br><span class="line">132</span><br><span class="line">133</span><br><span class="line">134</span><br><span class="line">135</span><br><span class="line">136</span><br><span class="line">137</span><br><span class="line">138</span><br><span class="line">139</span><br><span class="line">140</span><br><span class="line">141</span><br><span class="line">142</span><br><span class="line">143</span><br><span class="line">144</span><br><span class="line">145</span><br><span class="line">146</span><br><span class="line">147</span><br><span class="line">148</span><br><span class="line">149</span><br><span class="line">150</span><br><span class="line">151</span><br><span class="line">152</span><br><span class="line">153</span><br><span class="line">154</span><br><span class="line">155</span><br><span class="line">156</span><br><span class="line">157</span><br><span class="line">158</span><br><span class="line">159</span><br><span class="line">160</span><br><span class="line">161</span><br><span class="line">162</span><br><span class="line">163</span><br><span class="line">164</span><br><span class="line">165</span><br><span class="line">166</span><br><span class="line">167</span><br><span class="line">168</span><br><span class="line">169</span><br><span class="line">170</span><br><span class="line">171</span><br><span class="line">172</span><br><span class="line">173</span><br><span class="line">174</span><br><span class="line">175</span><br><span class="line">176</span><br><span class="line">177</span><br><span class="line">178</span><br><span class="line">179</span><br><span class="line">180</span><br><span class="line">181</span><br><span class="line">182</span><br><span class="line">183</span><br><span class="line">184</span><br><span class="line">185</span><br><span class="line">186</span><br><span class="line">187</span><br><span class="line">188</span><br><span class="line">189</span><br><span class="line">190</span><br><span class="line">191</span><br><span class="line">192</span><br><span class="line">193</span><br><span class="line">194</span><br><span class="line">195</span><br><span class="line">196</span><br><span class="line">197</span><br><span class="line">198</span><br><span class="line">199</span><br><span class="line">200</span><br><span class="line">201</span><br><span class="line">202</span><br><span class="line">203</span><br><span class="line">204</span><br><span class="line">205</span><br><span class="line">206</span><br><span class="line">207</span><br><span class="line">208</span><br><span class="line">209</span><br><span class="line">210</span><br><span class="line">211</span><br><span class="line">212</span><br><span class="line">213</span><br><span class="line">214</span><br><span class="line">215</span><br><span class="line">216</span><br><span class="line">217</span><br><span class="line">218</span><br><span class="line">219</span><br><span class="line">220</span><br><span class="line">221</span><br><span class="line">222</span><br><span class="line">223</span><br><span class="line">224</span><br><span class="line">225</span><br><span class="line">226</span><br><span class="line">227</span><br><span class="line">228</span><br><span class="line">229</span><br><span class="line">230</span><br><span class="line">231</span><br><span class="line">232</span><br><span class="line">233</span><br><span class="line">234</span><br><span class="line">235</span><br><span class="line">236</span><br><span class="line">237</span><br><span class="line">238</span><br><span class="line">239</span><br><span class="line">240</span><br><span class="line">241</span><br><span class="line">242</span><br><span class="line">243</span><br><span class="line">244</span><br><span class="line">245</span><br><span class="line">246</span><br><span class="line">247</span><br><span class="line">248</span><br><span class="line">249</span><br><span class="line">250</span><br><span class="line">251</span><br><span class="line">252</span><br><span class="line">253</span><br><span class="line">254</span><br><span class="line">255</span><br><span class="line">256</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> numpy <span class="keyword">as</span> np</span><br><span class="line"><span class="keyword">import</span> torch</span><br><span class="line"><span class="keyword">import</span> torch.nn <span class="keyword">as</span> nn</span><br><span class="line"><span class="keyword">import</span> torch.nn.functional <span class="keyword">as</span> F</span><br><span class="line"><span class="keyword">import</span> math, copy, time</span><br><span class="line"><span class="keyword">from</span> torch.autograd <span class="keyword">import</span> Variable</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">EncoderDecoder</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    A standard Encoder-Decoder architecture. Base for this and many </span></span><br><span class="line"><span class="string">    other models.</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, encoder, decoder, src_embed, tgt_embed, generator)</span>:</span></span><br><span class="line">        super(EncoderDecoder, self).__init__()</span><br><span class="line">        self.encoder = encoder</span><br><span class="line">        self.decoder = decoder</span><br><span class="line">        self.src_embed = src_embed</span><br><span class="line">        self.tgt_embed = tgt_embed</span><br><span class="line">        self.generator = generator</span><br><span class="line">        </span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, src, tgt, src_mask, tgt_mask)</span>:</span></span><br><span class="line">        <span class="string">"Take in and process masked src and target sequences."</span></span><br><span class="line">        <span class="keyword">return</span> self.decode(self.encode(src, src_mask), src_mask,</span><br><span class="line">                            tgt, tgt_mask)</span><br><span class="line">    </span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">encode</span><span class="params">(self, src, src_mask)</span>:</span></span><br><span class="line">        <span class="keyword">return</span> self.encoder(self.src_embed(src), src_mask)</span><br><span class="line">    </span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">decode</span><span class="params">(self, memory, src_mask, tgt, tgt_mask)</span>:</span></span><br><span class="line">        <span class="keyword">return</span> self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)</span><br><span class="line">    </span><br><span class="line"> </span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Generator</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="string">"Define standard linear + softmax generation step."</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, d_model, vocab)</span>:</span></span><br><span class="line">        super(Generator, self).__init__()</span><br><span class="line">        self.proj = nn.Linear(d_model, vocab)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, x)</span>:</span></span><br><span class="line">        <span class="keyword">return</span> F.log_softmax(self.proj(x), dim=<span class="number">-1</span>)</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">clones</span><span class="params">(module, N)</span>:</span></span><br><span class="line">    <span class="string">"Produce N identical layers."</span></span><br><span class="line">    <span class="keyword">return</span> nn.ModuleList([copy.deepcopy(module) <span class="keyword">for</span> _ <span class="keyword">in</span> range(N)])</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Encoder</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="string">"Core encoder is a stack of N layers"</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, layer, N)</span>:</span></span><br><span class="line">        super(Encoder, self).__init__()</span><br><span class="line">        self.layers = clones(layer, N)</span><br><span class="line">        self.norm = LayerNorm(layer.size)</span><br><span class="line">        </span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, x, mask)</span>:</span></span><br><span class="line">        <span class="string">"Pass the input (and mask) through each layer in turn."</span></span><br><span class="line">        <span class="keyword">for</span> layer <span class="keyword">in</span> self.layers:</span><br><span class="line">            x = layer(x, mask)</span><br><span class="line">        <span class="keyword">return</span> self.norm(x)</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">LayerNorm</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="string">"Construct a layernorm module (See citation for details)."</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, features, eps=<span class="number">1e-6</span>)</span>:</span></span><br><span class="line">        super(LayerNorm, self).__init__()</span><br><span class="line">        self.a_2 = nn.Parameter(torch.ones(features))</span><br><span class="line">        self.b_2 = nn.Parameter(torch.zeros(features))</span><br><span class="line">        self.eps = eps</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, x)</span>:</span></span><br><span class="line">        mean = x.mean(<span class="number">-1</span>, keepdim=<span class="literal">True</span>)</span><br><span class="line">        std = x.std(<span class="number">-1</span>, keepdim=<span class="literal">True</span>)</span><br><span class="line">        <span class="keyword">return</span> self.a_2 * (x - mean) / (std + self.eps) + self.b_2</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">SublayerConnection</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    A residual connection followed by a layer norm.</span></span><br><span class="line"><span class="string">    Note for code simplicity the norm is first as opposed to last.</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, size, dropout)</span>:</span></span><br><span class="line">        super(SublayerConnection, self).__init__()</span><br><span class="line">        self.norm = LayerNorm(size)</span><br><span class="line">        self.dropout = nn.Dropout(dropout)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, x, sublayer)</span>:</span></span><br><span class="line">        <span class="string">"Apply residual connection to any sublayer with the same size."</span></span><br><span class="line">        <span class="keyword">return</span> x + self.dropout(sublayer(self.norm(x)))</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">EncoderLayer</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="string">"Encoder is made up of self-attn and feed forward (defined below)"</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, size, self_attn, feed_forward, dropout)</span>:</span></span><br><span class="line">        super(EncoderLayer, self).__init__()</span><br><span class="line">        self.self_attn = self_attn</span><br><span class="line">        self.feed_forward = feed_forward</span><br><span class="line">        self.sublayer = clones(SublayerConnection(size, dropout), <span class="number">2</span>)</span><br><span class="line">        self.size = size</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, x, mask)</span>:</span></span><br><span class="line">        <span class="string">"Follow Figure 1 (left) for connections."</span></span><br><span class="line">        x = self.sublayer[<span class="number">0</span>](x, <span class="keyword">lambda</span> x: self.self_attn(x, x, x, mask))</span><br><span class="line">        <span class="keyword">return</span> self.sublayer[<span class="number">1</span>](x, self.feed_forward)</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Decoder</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="string">"Generic N layer decoder with masking."</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, layer, N)</span>:</span></span><br><span class="line">        super(Decoder, self).__init__()</span><br><span class="line">        self.layers = clones(layer, N)</span><br><span class="line">        self.norm = LayerNorm(layer.size)</span><br><span class="line">        </span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, x, memory, src_mask, tgt_mask)</span>:</span></span><br><span class="line">        <span class="keyword">for</span> layer <span class="keyword">in</span> self.layers:</span><br><span class="line">            x = layer(x, memory, src_mask, tgt_mask)</span><br><span class="line">        <span class="keyword">return</span> self.norm(x)</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">DecoderLayer</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="string">"Decoder is made of self-attn, src-attn, and feed forward (defined below)"</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, size, self_attn, src_attn, feed_forward, dropout)</span>:</span></span><br><span class="line">        super(DecoderLayer, self).__init__()</span><br><span class="line">        self.size = size</span><br><span class="line">        self.self_attn = self_attn</span><br><span class="line">        self.src_attn = src_attn</span><br><span class="line">        self.feed_forward = feed_forward</span><br><span class="line">        self.sublayer = clones(SublayerConnection(size, dropout), <span class="number">3</span>)</span><br><span class="line"> </span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, x, memory, src_mask, tgt_mask)</span>:</span></span><br><span class="line">        <span class="string">"Follow Figure 1 (right) for connections."</span></span><br><span class="line">        m = memory</span><br><span class="line">        x = self.sublayer[<span class="number">0</span>](x, <span class="keyword">lambda</span> x: self.self_attn(x, x, x, tgt_mask))</span><br><span class="line">        x = self.sublayer[<span class="number">1</span>](x, <span class="keyword">lambda</span> x: self.src_attn(x, m, m, src_mask))</span><br><span class="line">        <span class="keyword">return</span> self.sublayer[<span class="number">2</span>](x, self.feed_forward)</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">subsequent_mask</span><span class="params">(size)</span>:</span></span><br><span class="line">    <span class="string">"Mask out subsequent positions."</span></span><br><span class="line">    attn_shape = (<span class="number">1</span>, size, size)</span><br><span class="line">    subsequent_mask = np.triu(np.ones(attn_shape), k=<span class="number">1</span>).astype(<span class="string">'uint8'</span>)</span><br><span class="line">    <span class="keyword">return</span> torch.from_numpy(subsequent_mask) == <span class="number">0</span></span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">attention</span><span class="params">(query, key, value, mask=None, dropout=None)</span>:</span></span><br><span class="line">    <span class="string">"Compute 'Scaled Dot Product Attention'"</span></span><br><span class="line">    d_k = query.size(<span class="number">-1</span>)</span><br><span class="line">    scores = torch.matmul(query, key.transpose(<span class="number">-2</span>, <span class="number">-1</span>)) \</span><br><span class="line">             / math.sqrt(d_k)</span><br><span class="line">    <span class="keyword">if</span> mask <span class="keyword">is</span> <span class="keyword">not</span> <span class="literal">None</span>:</span><br><span class="line">        scores = scores.masked_fill(mask == <span class="number">0</span>, <span class="number">-1e9</span>)</span><br><span class="line">    p_attn = F.softmax(scores, dim = <span class="number">-1</span>)</span><br><span class="line">    <span class="keyword">if</span> dropout <span class="keyword">is</span> <span class="keyword">not</span> <span class="literal">None</span>:</span><br><span class="line">        p_attn = dropout(p_attn)</span><br><span class="line">    <span class="keyword">return</span> torch.matmul(p_attn, value), p_attn</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">MultiHeadedAttention</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, h, d_model, dropout=<span class="number">0.1</span>)</span>:</span></span><br><span class="line">        <span class="string">"Take in model size and number of heads."</span></span><br><span class="line">        super(MultiHeadedAttention, self).__init__()</span><br><span class="line">        <span class="keyword">assert</span> d_model % h == <span class="number">0</span></span><br><span class="line">        <span class="comment"># We assume d_v always equals d_k</span></span><br><span class="line">        self.d_k = d_model // h</span><br><span class="line">        self.h = h</span><br><span class="line">        self.linears = clones(nn.Linear(d_model, d_model), <span class="number">4</span>)</span><br><span class="line">        self.attn = <span class="literal">None</span></span><br><span class="line">        self.dropout = nn.Dropout(p=dropout)</span><br><span class="line">        </span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, query, key, value, mask=None)</span>:</span></span><br><span class="line">        <span class="string">"Implements Figure 2"</span></span><br><span class="line">        <span class="keyword">if</span> mask <span class="keyword">is</span> <span class="keyword">not</span> <span class="literal">None</span>:</span><br><span class="line">            <span class="comment"># Same mask applied to all h heads.</span></span><br><span class="line">            mask = mask.unsqueeze(<span class="number">1</span>)</span><br><span class="line">        nbatches = query.size(<span class="number">0</span>)</span><br><span class="line">        </span><br><span class="line">        <span class="comment"># 1) Do all the linear projections in batch from d_model =&gt; h x d_k </span></span><br><span class="line">        query, key, value = \</span><br><span class="line">            [l(x).view(nbatches, <span class="number">-1</span>, self.h, self.d_k).transpose(<span class="number">1</span>, <span class="number">2</span>)</span><br><span class="line">             <span class="keyword">for</span> l, x <span class="keyword">in</span> zip(self.linears, (query, key, value))]</span><br><span class="line">        </span><br><span class="line">        <span class="comment"># 2) Apply attention on all the projected vectors in batch. </span></span><br><span class="line">        x, self.attn = attention(query, key, value, mask=mask, </span><br><span class="line">                                 dropout=self.dropout)</span><br><span class="line">        </span><br><span class="line">        <span class="comment"># 3) "Concat" using a view and apply a final linear. </span></span><br><span class="line">        x = x.transpose(<span class="number">1</span>, <span class="number">2</span>).contiguous() \</span><br><span class="line">             .view(nbatches, <span class="number">-1</span>, self.h * self.d_k)</span><br><span class="line">        <span class="keyword">return</span> self.linears[<span class="number">-1</span>](x)</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">PositionwiseFeedForward</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="string">"Implements FFN equation."</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, d_model, d_ff, dropout=<span class="number">0.1</span>)</span>:</span></span><br><span class="line">        super(PositionwiseFeedForward, self).__init__()</span><br><span class="line">        self.w_1 = nn.Linear(d_model, d_ff)</span><br><span class="line">        self.w_2 = nn.Linear(d_ff, d_model)</span><br><span class="line">        self.dropout = nn.Dropout(dropout)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, x)</span>:</span></span><br><span class="line">        <span class="keyword">return</span> self.w_2(self.dropout(F.relu(self.w_1(x))))</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Embeddings</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, d_model, vocab)</span>:</span></span><br><span class="line">        super(Embeddings, self).__init__()</span><br><span class="line">        self.lut = nn.Embedding(vocab, d_model)</span><br><span class="line">        self.d_model = d_model</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, x)</span>:</span></span><br><span class="line">        <span class="keyword">return</span> self.lut(x) * math.sqrt(self.d_model)</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">PositionalEncoding</span><span class="params">(nn.Module)</span>:</span></span><br><span class="line">    <span class="string">"Implement the PE function."</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, d_model, dropout, max_len=<span class="number">5000</span>)</span>:</span></span><br><span class="line">        super(PositionalEncoding, self).__init__()</span><br><span class="line">        self.dropout = nn.Dropout(p=dropout)</span><br><span class="line">        </span><br><span class="line">        <span class="comment"># Compute the positional encodings once in log space.</span></span><br><span class="line">        pe = torch.zeros(max_len, d_model)</span><br><span class="line">        position = torch.arange(<span class="number">0</span>, max_len).unsqueeze(<span class="number">1</span>)</span><br><span class="line">        div_term = torch.exp(torch.arange(<span class="number">0</span>, d_model, <span class="number">2</span>) *</span><br><span class="line">                             -(math.log(<span class="number">10000.0</span>) / d_model))</span><br><span class="line">        pe[:, <span class="number">0</span>::<span class="number">2</span>] = torch.sin(position * div_term)</span><br><span class="line">        pe[:, <span class="number">1</span>::<span class="number">2</span>] = torch.cos(position * div_term)</span><br><span class="line">        pe = pe.unsqueeze(<span class="number">0</span>)</span><br><span class="line">        self.register_buffer(<span class="string">'pe'</span>, pe)</span><br><span class="line">        </span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">forward</span><span class="params">(self, x)</span>:</span></span><br><span class="line">        x = x + Variable(self.pe[:, :x.size(<span class="number">1</span>)], </span><br><span class="line">                         requires_grad=<span class="literal">False</span>)</span><br><span class="line">        <span class="keyword">return</span> self.dropout(x)</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">make_model</span><span class="params">(src_vocab, tgt_vocab, N=<span class="number">6</span>, </span></span></span><br><span class="line"><span class="function"><span class="params">               d_model=<span class="number">512</span>, d_ff=<span class="number">2048</span>, h=<span class="number">8</span>, dropout=<span class="number">0.1</span>)</span>:</span></span><br><span class="line">    <span class="string">"Helper: Construct a model from hyperparameters."</span></span><br><span class="line">    c = copy.deepcopy</span><br><span class="line">    attn = MultiHeadedAttention(h, d_model)</span><br><span class="line">    ff = PositionwiseFeedForward(d_model, d_ff, dropout)</span><br><span class="line">    position = PositionalEncoding(d_model, dropout)</span><br><span class="line">    model = EncoderDecoder(</span><br><span class="line">        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),</span><br><span class="line">        Decoder(DecoderLayer(d_model, c(attn), c(attn), </span><br><span class="line">                             c(ff), dropout), N),</span><br><span class="line">        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),</span><br><span class="line">        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),</span><br><span class="line">        Generator(d_model, tgt_vocab))</span><br><span class="line">    </span><br><span class="line">    <span class="comment"># This was important from their code. </span></span><br><span class="line">    <span class="comment"># Initialize parameters with Glorot / fan_avg.</span></span><br><span class="line">    <span class="keyword">for</span> p <span class="keyword">in</span> model.parameters():</span><br><span class="line">        <span class="keyword">if</span> p.dim() &gt; <span class="number">1</span>:</span><br><span class="line">            nn.init.xavier_uniform(p)</span><br><span class="line">    <span class="keyword">return</span> model</span><br></pre></td></tr></table></figure>
<h1 id="7-参考资料"><a href="#7-参考资料" class="headerlink" title="7. 参考资料"></a>7. 参考资料</h1><ol>
<li><a href="https://arxiv.org/pdf/1706.03762.pdf" target="_blank" rel="noopener">Attention is all you need</a></li>
<li><a href="https://jalammar.github.io/illustrated-transformer/" target="_blank" rel="noopener">The Illustrated Transformer</a></li>
<li><a href="http://super1peng.xyz/2018/11/26/Attention-is-All-You-Need/?nsukey=vdt8WL9eYHSqq%2F005akltYu4igB%2BuTF%2B2KWPULUX1nn8k91eO9sr%2BChTyLJXQ37Au2eWcaYldRdlOfl8pIQyv9ppfGptFfwwtw0efEQJ33aGvuOKBMUFWJPNFIVeVRYNqnUtVUSrIjo7nWkkOMH%2B%2FXGP%2BMomk8lN%2BVxi2o6ynULt3bVzxCufGq1rAYv8Q52P3ZN6poCGdX3jQTGpaLAt3g%3D%3D" target="_blank" rel="noopener">细讲 | Attention Is All You Need</a></li>
<li><a href="https://spaces.ac.cn/archives/4765" target="_blank" rel="noopener">《Attention is All You Need》浅读（简介+代码）</a></li>
<li><a href="http://nlp.seas.harvard.edu/2018/04/03/attention.html" target="_blank" rel="noopener">The Annotated Transformer</a></li>
<li><a href="https://arxiv.org/abs/1705.03122" target="_blank" rel="noopener">Convolutional Sequence to Sequence Learning</a></li>
</ol>

        </div>
        
          


  <section class='meta' id="footer-meta">
    <hr>
    <div class='new-meta-box'>
      
        
          <div class="new-meta-item date" itemprop="dateUpdated" datetime="2021-08-23T01:05:27+08:00">
  <a class='notlink'>
    <i class="fas fa-clock" aria-hidden="true"></i>
    <p>最后更新于 2021年8月23日</p>
  </a>
</div>

        
      
        
          
  
  <div class="new-meta-item meta-tags"><a class="tag" href="/tags/transformer/" rel="nofollow"><i class="fas fa-hashtag" aria-hidden="true"></i>&nbsp;<p>Transformer</p></a></div> <div class="new-meta-item meta-tags"><a class="tag" href="/tags/attention/" rel="nofollow"><i class="fas fa-hashtag" aria-hidden="true"></i>&nbsp;<p>Attention</p></a></div>


        
      
        
          
  <div class="new-meta-item share -mob-share-list">
  <div class="-mob-share-list share-body">
    
      
        <a class="-mob-share-qq" title="QQ好友" rel="external nofollow noopener noreferrer"
          
          href="http://connect.qq.com/widget/shareqq/index.html?url=https://rogerspy.gitee.io/2019/08/27/NLP中的注意力机制简介（二）/&title=NLP中的注意力机制简介（二） | Rogerspy's Home&summary=
    ——Transformer专题篇




1. 前言之前我们介绍了各种各样的注意力机制，如果仔细回想一下就可以发现无论是哪种注意力机制都不是单独出现的，都是伴随着RNN或者其他RNN的变种。这种基于RNN的注意力机制会面临一个问题就是，难以处理长序列的句子，因为无法实现并行计算，所以非常消耗计算资源。"
          
          >
          
            <img src="https://cdn.jsdelivr.net/gh/xaoxuu/assets@19.1.9/logo/128/qq.png">
          
        </a>
      
    
      
        <a class="-mob-share-qzone" title="QQ空间" rel="external nofollow noopener noreferrer"
          
          href="https://sns.qzone.qq.com/cgi-bin/qzshare/cgi_qzshare_onekey?url=https://rogerspy.gitee.io/2019/08/27/NLP中的注意力机制简介（二）/&title=NLP中的注意力机制简介（二） | Rogerspy's Home&summary=
    ——Transformer专题篇




1. 前言之前我们介绍了各种各样的注意力机制，如果仔细回想一下就可以发现无论是哪种注意力机制都不是单独出现的，都是伴随着RNN或者其他RNN的变种。这种基于RNN的注意力机制会面临一个问题就是，难以处理长序列的句子，因为无法实现并行计算，所以非常消耗计算资源。"
          
          >
          
            <img src="https://cdn.jsdelivr.net/gh/xaoxuu/assets@19.1.9/logo/128/qzone.png">
          
        </a>
      
    
      
        <a class='qrcode' rel="external nofollow noopener noreferrer" href=''>
        
          <img src="https://cdn.jsdelivr.net/gh/xaoxuu/assets@19.1.9/logo/128/wechat.png">
        
        </a>
      
    
      
        <a class="-mob-share-weibo" title="微博" rel="external nofollow noopener noreferrer"
          
          href="http://service.weibo.com/share/share.php?url=https://rogerspy.gitee.io/2019/08/27/NLP中的注意力机制简介（二）/&title=NLP中的注意力机制简介（二） | Rogerspy's Home&summary=
    ——Transformer专题篇




1. 前言之前我们介绍了各种各样的注意力机制，如果仔细回想一下就可以发现无论是哪种注意力机制都不是单独出现的，都是伴随着RNN或者其他RNN的变种。这种基于RNN的注意力机制会面临一个问题就是，难以处理长序列的句子，因为无法实现并行计算，所以非常消耗计算资源。"
          
          >
          
            <img src="https://cdn.jsdelivr.net/gh/xaoxuu/assets@19.1.9/logo/128/weibo.png">
          
        </a>
      
    
  </div>
</div>



        
      
    </div>
  </section>


        
        
            <div class="prev-next">
                
                    <section class="prev">
                        <span class="art-item-left">
                            <h6><i class="fas fa-chevron-left" aria-hidden="true"></i>&nbsp;上一页</h6>
                            <h4>
                                <a href="/2019/09/01/analyse-transformer/" rel="prev" title="关于Transformer的分析">
                                  
                                      关于Transformer的分析
                                  
                                </a>
                            </h4>
                            
                                
                                <h6 class="tags">
                                    <a class="tag" href="/tags/transformer/"><i class="fas fa-hashtag fa-fw" aria-hidden="true"></i>Transformer</a> <a class="tag" href="/tags/attention/"><i class="fas fa-hashtag fa-fw" aria-hidden="true"></i>Attention</a>
                                </h6>
                            
                        </span>
                    </section>
                
                
                    <section class="next">
                        <span class="art-item-right" aria-hidden="true">
                            <h6>下一页&nbsp;<i class="fas fa-chevron-right" aria-hidden="true"></i></h6>
                            <h4>
                                <a href="/2019/08/26/NLP中的注意力机制简介（一）/" rel="prev" title="NLP中的注意力机制简介（一）">
                                    
                                        NLP中的注意力机制简介（一）
                                    
                                </a>
                            </h4>
                            
                                
                                <h6 class="tags">
                                    <a class="tag" href="/tags/attention/"><i class="fas fa-hashtag fa-fw" aria-hidden="true"></i>Attention</a>
                                </h6>
                            
                        </span>
                    </section>
                
            </div>
        
      </section>
    </article>
  

  
    <!-- 显示推荐文章和评论 -->



  <article class="post white-box comments">
    <section class="article typo">
      <h4><i class="fas fa-comments fa-fw" aria-hidden="true"></i>&nbsp;评论</h4>
      
      
      
        <section id="comments">
          <div id="gitalk-container"></div>
        </section>
      
      
    </section>
  </article>


  




<!-- 根据页面mathjax变量决定是否加载MathJax数学公式js -->

  <!-- MathJax配置，可通过单美元符号书写行内公式等 -->
<script type="text/x-mathjax-config">
  MathJax.Hub.Config({
    "HTML-CSS": {
      preferredFont: "TeX",
      availableFonts: ["STIX","TeX"],
      linebreaks: { automatic:true },
      EqnChunk: (MathJax.Hub.Browser.isMobile ? 10 : 50)
    },
    tex2jax: {
      inlineMath: [ ["$", "$"], ["\\(","\\)"] ],
      processEscapes: true,
      ignoreClass: "tex2jax_ignore|dno",
      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
    },
    TeX: {
      equationNumbers: { autoNumber: "AMS" },
      noUndefined: { attributes: { mathcolor: "red", mathbackground: "#FFEEEE", mathsize: "90%" } },
      Macros: { href: "{}" }
    },
    messageStyle: "none"
  });
</script>
<!-- 给MathJax元素添加has-jax class -->
<script type="text/x-mathjax-config">
  MathJax.Hub.Queue(function() {
    var all = MathJax.Hub.getAllJax(), i;
    for(i=0; i < all.length; i += 1) {
      all[i].SourceElement().parentNode.className += (all[i].SourceElement().parentNode.className ? ' ' : '') + 'has-jax';
    }
  });
</script>
<!-- 通过连接CDN加载MathJax的js代码 -->
<script type="text/javascript" async
  src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML">
</script>




  <script>
    window.subData = {
      title: 'NLP中的注意力机制简介（二）',
      tools: true
    }
  </script>


</div>
<aside class='l_side'>
  
    
    
      
        
          
          
            <section class='widget shake author'>
  <div class='content pure'>
    
      <div class='avatar'>
        <img class='avatar' src='https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/65-1Z31313530JC.jpeg'/>
      </div>
    
    
    
      <div class="social-wrapper">
        
          
            <a href="/atom.xml"
              class="social fas fa-rss flat-btn"
              target="_blank"
              rel="external nofollow noopener noreferrer">
            </a>
          
        
          
            <a href="mailto:rogerspy@163.com"
              class="social fas fa-envelope flat-btn"
              target="_blank"
              rel="external nofollow noopener noreferrer">
            </a>
          
        
          
            <a href="https://github.com/rogerspy"
              class="social fab fa-github flat-btn"
              target="_blank"
              rel="external nofollow noopener noreferrer">
            </a>
          
        
          
            <a href="https://music.163.com/#/user/home?id=1960721923"
              class="social fas fa-headphones-alt flat-btn"
              target="_blank"
              rel="external nofollow noopener noreferrer">
            </a>
          
        
      </div>
    
  </div>
</section>

          
        
      
        
          
          
            
  <section class='widget toc-wrapper'>
    
<header class='pure'>
  <div><i class="fas fa-list fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;本文目录</div>
  
    <div class='wrapper'><a class="s-toc rightBtn" rel="external nofollow noopener noreferrer" href="javascript:void(0)"><i class="fas fa-thumbtack fa-fw"></i></a></div>
  
</header>

    <div class='content pure'>
      <ol class="toc"><li class="toc-item toc-level-1"><a class="toc-link" href="#null"><span class="toc-text">
    ——Transformer专题篇
</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#1-前言"><span class="toc-text">1. 前言</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#2-模型结构"><span class="toc-text">2. 模型结构</span></a><ol class="toc-child"><li class="toc-item toc-level-2"><a class="toc-link" href="#2-1-模型结构总览"><span class="toc-text">2.1 模型结构总览</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#2-2-Multi-Head-Attention"><span class="toc-text">2.2 Multi-Head Attention</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#2-3-Scaled-Dot-Product-Attention"><span class="toc-text">2.3 Scaled-Dot Product Attention</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#2-4-矩阵计算-Scaled-Dot-Product-Attention"><span class="toc-text">2.4 矩阵计算 Scaled-Dot Product Attention</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#2-5-矩阵计算-Multi-Head"><span class="toc-text">2.5 矩阵计算 Multi-Head</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#2-6-Position-Encoding"><span class="toc-text">2.6 Position Encoding</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#2-6-残差结构"><span class="toc-text">2.6 残差结构</span></a></li></ol></li><li class="toc-item toc-level-1"><a class="toc-link" href="#3-模型组装"><span class="toc-text">3. 模型组装</span></a><ol class="toc-child"><li class="toc-item toc-level-2"><a class="toc-link" href="#3-1-Encoder"><span class="toc-text">3.1 Encoder</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#3-2-Decoder"><span class="toc-text">3.2 Decoder</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#3-3-最后的Linear层和Softmax层"><span class="toc-text">3.3 最后的Linear层和Softmax层</span></a></li></ol></li><li class="toc-item toc-level-1"><a class="toc-link" href="#4-Transformer在机器翻译中的应用"><span class="toc-text">4. Transformer在机器翻译中的应用</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#5-关于K-Q-V的讨论"><span class="toc-text">5. 关于K, Q, V的讨论</span></a><ol class="toc-child"><li class="toc-item toc-level-2"><a class="toc-link" href="#5-1-我们为什么需要V？"><span class="toc-text">5.1 我们为什么需要V？</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#5-2-为什么使用两个不同的权重获得K-Q"><span class="toc-text">5.2 为什么使用两个不同的权重获得K, Q?</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#5-3-Transformer是如何实现All-You-Need的？"><span class="toc-text">5.3 Transformer是如何实现All You Need的？</span></a></li></ol></li><li class="toc-item toc-level-1"><a class="toc-link" href="#6-代码实现（Pytorch）"><span class="toc-text">6. 代码实现（Pytorch）</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#7-参考资料"><span class="toc-text">7. 参考资料</span></a></li></ol>
    </div>
  </section>


          
        
      
        
          
          
            <section class='widget grid'>
  
<header class='pure'>
  <div><i class="fas fa-map-signs fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;站内导航</div>
  
</header>

  <div class='content pure'>
    <ul class="grid navgation">
      
        <li><a class="flat-box" " href="/"
          
          
          id="home">
          
            <i class="fas fa-clock fa-fw" aria-hidden="true"></i>
          
          近期文章
        </a></li>
      
        <li><a class="flat-box" " href="/blog/"
          
          
          id="blog">
          
            <i class="fas fa-edit fa-fw" aria-hidden="true"></i>
          
          我的博客
        </a></li>
      
        <li><a class="flat-box" " href="/paper_note/"
          
          
          id="paper_note">
          
            <i class="fas fa-book fa-fw" aria-hidden="true"></i>
          
          论文笔记
        </a></li>
      
        <li><a class="flat-box" " href="/algorithm/"
          
          
          id="algorithm">
          
            <i class="fas fa-cube fa-fw" aria-hidden="true"></i>
          
          算法基础
        </a></li>
      
        <li><a class="flat-box" " href="/leetcode/"
          
          
          id="leetcode">
          
            <i class="fas fa-code fa-fw" aria-hidden="true"></i>
          
          Leetcode
        </a></li>
      
        <li><a class="flat-box" " href="/video/"
          
          
          id="video">
          
            <i class="fas fa-film fa-fw" aria-hidden="true"></i>
          
          视频小站
        </a></li>
      
        <li><a class="flat-box" " href="/material/"
          
          
          id="material">
          
            <i class="fas fa-briefcase fa-fw" aria-hidden="true"></i>
          
          学习资料
        </a></li>
      
        <li><a class="flat-box" " href="/dataset/"
          
          
          id="dataset">
          
            <i class="fas fa-database fa-fw" aria-hidden="true"></i>
          
          数据集
        </a></li>
      
        <li><a class="flat-box" " href="/articles/"
          
          
          id="articles">
          
            <i class="fas fa-sticky-note fa-fw" aria-hidden="true"></i>
          
          杂文天地
        </a></li>
      
        <li><a class="flat-box" " href="/blog/archives/"
          
            rel="nofollow"
          
          
          id="blogarchives">
          
            <i class="fas fa-archive fa-fw" aria-hidden="true"></i>
          
          文章归档
        </a></li>
      
        <li><a class="flat-box" " href="/personal_center/"
          
          
          id="personal_center">
          
            <i class="fas fa-university fa-fw" aria-hidden="true"></i>
          
          个人中心
        </a></li>
      
        <li><a class="flat-box" " href="/about/"
          
            rel="nofollow"
          
          
          id="about">
          
            <i class="fas fa-info-circle fa-fw" aria-hidden="true"></i>
          
          关于小站
        </a></li>
      
    </ul>
  </div>
</section>

          
        
      
        
          
          
            <section class='widget list'>
  
<header class='pure'>
  <div><i class="fas fa-terminal fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;机器学习框架</div>
  
</header>

  <div class='content pure'>
    <ul class="entry">
      
        <li><a class="flat-box" title="https://rogerspy.gitee.io/pytorch-zh/" href="https://rogerspy.gitee.io/pytorch-zh/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-star fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;PyTorch 中文文档
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://keras-zh.readthedocs.io/" href="https://keras-zh.readthedocs.io/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-star fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Keras 中文文档
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://tensorflow.google.cn/" href="https://tensorflow.google.cn/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-star fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Tensorflow 中文文档
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="http://scikitlearn.com.cn/" href="http://scikitlearn.com.cn/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-star fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Scikit Learn 中文文档
          </div>
          
        </a></li>
      
    </ul>
  </div>
</section>

          
        
      
        
          
          
            <section class='widget list'>
  
<header class='pure'>
  <div><i class="fas fa-wrench fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;百宝箱</div>
  
</header>

  <div class='content pure'>
    <ul class="entry">
      
        <li><a class="flat-box" title="https://rogerspy.github.io/excalidraw-claymate/" href="https://rogerspy.github.io/excalidraw-claymate/"
          
          
            target="_blank"
          
          >
          <div class='name'>
            
              <i class="fas fa-magic fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Excalidraw-Claymate
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://rogerspy.github.io/jupyterlite/" href="https://rogerspy.github.io/jupyterlite/"
          
          
            target="_blank"
          
          >
          <div class='name'>
            
              <i class="fas fa-terminal fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;JupyterLite
          </div>
          
        </a></li>
      
    </ul>
  </div>
</section>

          
        
      
        
          
          
            <section class='widget list'>
  
<header class='pure'>
  <div><i class="fas fa-eye fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;睁眼看世界</div>
  
</header>

  <div class='content pure'>
    <ul class="entry">
      
        <li><a class="flat-box" title="https://deeplearn.org/" href="https://deeplearn.org/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-link fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Deep Learning Monitor
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://paperswithcode.com/sota" href="https://paperswithcode.com/sota"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-link fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Browse State-of-the-Art
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://huggingface.co/transformers/" href="https://huggingface.co/transformers/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-link fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Transformers
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://huggingface.co/models" href="https://huggingface.co/models"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-link fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Transformers-models
          </div>
          
        </a></li>
      
    </ul>
  </div>
</section>

          
        
      
        
          
          
            
  <section class='widget category'>
    
<header class='pure'>
  <div><i class="fas fa-folder-open fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;文章分类</div>
  
    <a class="rightBtn"
    
      rel="nofollow"
    
    
    href="/categories/"
    title="categories/">
    <i class="fas fa-expand-arrows-alt fa-fw"></i></a>
  
</header>

    <div class='content pure'>
      <ul class="entry">
        
          <li><a class="flat-box" title="/categories/nl2sql/" href="/categories/nl2sql/"><div class='name'>NL2SQL</div><div class='badge'>(1)</div></a></li>
        
          <li><a class="flat-box" title="/categories/nlp/" href="/categories/nlp/"><div class='name'>NLP</div><div class='badge'>(23)</div></a></li>
        
          <li><a class="flat-box" title="/categories/博客转载/" href="/categories/博客转载/"><div class='name'>博客转载</div><div class='badge'>(5)</div></a></li>
        
          <li><a class="flat-box" title="/categories/数据结构与算法/" href="/categories/数据结构与算法/"><div class='name'>数据结构与算法</div><div class='badge'>(11)</div></a></li>
        
          <li><a class="flat-box" title="/categories/知识图谱/" href="/categories/知识图谱/"><div class='name'>知识图谱</div><div class='badge'>(3)</div></a></li>
        
          <li><a class="flat-box" title="/categories/论文解读/" href="/categories/论文解读/"><div class='name'>论文解读</div><div class='badge'>(2)</div></a></li>
        
          <li><a class="flat-box" title="/categories/语言模型/" href="/categories/语言模型/"><div class='name'>语言模型</div><div class='badge'>(10)</div></a></li>
        
      </ul>
    </div>
  </section>


          
        
      
        
          
          
            
  <section class='widget tagcloud'>
    
<header class='pure'>
  <div><i class="fas fa-fire fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;热门标签</div>
  
    <a class="rightBtn"
    
      rel="nofollow"
    
    
    href="/tags/"
    title="tags/">
    <i class="fas fa-expand-arrows-alt fa-fw"></i></a>
  
</header>

    <div class='content pure'>
      <a href="/tags/attention/" style="font-size: 16.86px; color: #868686">Attention</a> <a href="/tags/cnnlm/" style="font-size: 14px; color: #999">CNNLM</a> <a href="/tags/data-structure/" style="font-size: 14px; color: #999">Data Structure</a> <a href="/tags/deep/" style="font-size: 14px; color: #999">Deep</a> <a href="/tags/ffnnlm/" style="font-size: 14px; color: #999">FFNNLM</a> <a href="/tags/gaussian/" style="font-size: 14px; color: #999">Gaussian</a> <a href="/tags/initialization/" style="font-size: 14px; color: #999">Initialization</a> <a href="/tags/kg/" style="font-size: 16.86px; color: #868686">KG</a> <a href="/tags/lstm/" style="font-size: 14px; color: #999">LSTM</a> <a href="/tags/lstmlm/" style="font-size: 14px; color: #999">LSTMLM</a> <a href="/tags/language-model/" style="font-size: 16.86px; color: #868686">Language Model</a> <a href="/tags/log-linear-language-model/" style="font-size: 14px; color: #999">Log-Linear Language Model</a> <a href="/tags/nlp/" style="font-size: 19.71px; color: #727272">NLP</a> <a href="/tags/nmt/" style="font-size: 22.57px; color: #5f5f5f">NMT</a> <a href="/tags/norm/" style="font-size: 14px; color: #999">Norm</a> <a href="/tags/probabilistic-language-model/" style="font-size: 14px; color: #999">Probabilistic Language Model</a> <a href="/tags/rnnlm/" style="font-size: 14px; color: #999">RNNLM</a> <a href="/tags/roc-auc/" style="font-size: 14px; color: #999">ROC-AUC</a> <a href="/tags/transformer/" style="font-size: 24px; color: #555">Transformer</a> <a href="/tags/context2vec/" style="font-size: 14px; color: #999">context2vec</a> <a href="/tags/divide-conquer/" style="font-size: 14px; color: #999">divide-conquer</a> <a href="/tags/insertion/" style="font-size: 16.86px; color: #868686">insertion</a> <a href="/tags/insertion-deletion/" style="font-size: 15.43px; color: #8f8f8f">insertion-deletion</a> <a href="/tags/knowledge-modelling/" style="font-size: 15.43px; color: #8f8f8f">knowledge-modelling</a> <a href="/tags/nl2infographic/" style="font-size: 14px; color: #999">nl2infographic</a> <a href="/tags/nl2sql/" style="font-size: 14px; color: #999">nl2sql</a> <a href="/tags/ontology/" style="font-size: 14px; color: #999">ontology</a> <a href="/tags/parallel-recurrent/" style="font-size: 14px; color: #999">parallel-recurrent</a> <a href="/tags/pytorch/" style="font-size: 14px; color: #999">pytorch</a> <a href="/tags/queue/" style="font-size: 18.29px; color: #7c7c7c">queue</a> <a href="/tags/sparse/" style="font-size: 14px; color: #999">sparse</a> <a href="/tags/stack/" style="font-size: 14px; color: #999">stack</a> <a href="/tags/tensorflow/" style="font-size: 14px; color: #999">tensorflow</a> <a href="/tags/text2viz/" style="font-size: 14px; color: #999">text2viz</a> <a href="/tags/weighted-head/" style="font-size: 14px; color: #999">weighted-head</a> <a href="/tags/半监督语言模型/" style="font-size: 14px; color: #999">半监督语言模型</a> <a href="/tags/双数组前缀树/" style="font-size: 14px; color: #999">双数组前缀树</a> <a href="/tags/推荐系统/" style="font-size: 14px; color: #999">推荐系统</a> <a href="/tags/数据结构/" style="font-size: 21.14px; color: #686868">数据结构</a> <a href="/tags/数组/" style="font-size: 14px; color: #999">数组</a> <a href="/tags/时间复杂度/" style="font-size: 14px; color: #999">时间复杂度</a> <a href="/tags/算法/" style="font-size: 14px; color: #999">算法</a> <a href="/tags/评估方法/" style="font-size: 14px; color: #999">评估方法</a> <a href="/tags/词向量/" style="font-size: 14px; color: #999">词向量</a> <a href="/tags/隐式正则化/" style="font-size: 14px; color: #999">隐式正则化</a>
    </div>
  </section>


          
        
      
        
          
          
            


  <section class='widget music'>
    
<header class='pure'>
  <div><i class="fas fa-compact-disc fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;最近在听</div>
  
    <a class="rightBtn"
    
      rel="external nofollow noopener noreferrer"
    
    
      target="_blank"
    
    href="https://music.163.com/#/user/home?id=1960721923"
    title="https://music.163.com/#/user/home?id=1960721923">
    <i class="far fa-heart fa-fw"></i></a>
  
</header>

    <div class='content pure'>
      
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/aplayer@1.7.0/dist/APlayer.min.css">
  <div class="aplayer"
    data-theme="#1BCDFC"
    
    
    data-mode="circulation"
    data-server="netease"
    data-type="playlist"
    data-id="2957571193"
    data-volume="0.7">
  </div>
  <script src="https://cdn.jsdelivr.net/npm/aplayer@1.7.0/dist/APlayer.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/meting@1.1.0/dist/Meting.min.js"></script>


    </div>
  </section>


          
        
      
    

  
</aside>

<footer id="footer" class="clearfix">
  <div id="sitetime"></div>
  
  
    <div class="social-wrapper">
      
        
          <a href="/atom.xml"
            class="social fas fa-rss flat-btn"
            target="_blank"
            rel="external nofollow noopener noreferrer">
          </a>
        
      
        
          <a href="mailto:rogerspy@163.com"
            class="social fas fa-envelope flat-btn"
            target="_blank"
            rel="external nofollow noopener noreferrer">
          </a>
        
      
        
          <a href="https://github.com/rogerspy"
            class="social fab fa-github flat-btn"
            target="_blank"
            rel="external nofollow noopener noreferrer">
          </a>
        
      
        
          <a href="https://music.163.com/#/user/home?id=1960721923"
            class="social fas fa-headphones-alt flat-btn"
            target="_blank"
            rel="external nofollow noopener noreferrer">
          </a>
        
      
    </div>
  
  <br>
  <div><p>博客内容遵循 <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/deed.zh">署名-非商业性使用-相同方式共享 4.0 国际 (CC BY-NC-SA 4.0) 协议</a></p>
</div>
  <div>
    本站使用
    <a href="https://xaoxuu.com/wiki/material-x/" target="_blank" class="codename">Material X</a>
    作为主题
    
      ，
      总访问量为
      <span id="busuanzi_value_site_pv"><i class="fas fa-spinner fa-spin fa-fw" aria-hidden="true"></i></span>
      次
    
    。
  </div>
	</footer>

<script>setLoadingBarProgress(80);</script>
<!-- 点击特效，输入特效 运行时间 -->
<script type="text/javascript" src="/cool/cooltext.js"></script>
<script type="text/javascript" src="/cool/clicklove.js"></script>
<script type="text/javascript" src="/cool/sitetime.js"></script>



      <script>setLoadingBarProgress(60);</script>
    </div>
    <a class="s-top fas fa-arrow-up fa-fw" href='javascript:void(0)'></a>
  </div>
  <script src="https://cdn.jsdelivr.net/npm/jquery@3.3.1/dist/jquery.min.js"></script>

  <script>
    var GOOGLE_CUSTOM_SEARCH_API_KEY = "";
    var GOOGLE_CUSTOM_SEARCH_ENGINE_ID = "";
    var ALGOLIA_API_KEY = "";
    var ALGOLIA_APP_ID = "";
    var ALGOLIA_INDEX_NAME = "";
    var AZURE_SERVICE_NAME = "";
    var AZURE_INDEX_NAME = "";
    var AZURE_QUERY_KEY = "";
    var BAIDU_API_ID = "";
    var SEARCH_SERVICE = "hexo" || "hexo";
    var ROOT = "/"||"/";
    if(!ROOT.endsWith('/'))ROOT += '/';
  </script>

<script src="//instant.page/1.2.2" type="module" integrity="sha384-2xV8M5griQmzyiY3CDqh1dn4z3llDVqZDqzjzcY+jCBCk/a5fXJmuZ/40JJAPeoU"></script>


  <script async src="https://cdn.jsdelivr.net/npm/scrollreveal@4.0.5/dist/scrollreveal.min.js"></script>
  <script type="text/javascript">
    $(function() {
      const $reveal = $('.reveal');
      if ($reveal.length === 0) return;
      const sr = ScrollReveal({ distance: 0 });
      sr.reveal('.reveal');
    });
  </script>


  <script src="https://cdn.jsdelivr.net/npm/node-waves@0.7.6/dist/waves.min.js"></script>
  <script type="text/javascript">
    $(function() {
      Waves.attach('.flat-btn', ['waves-button']);
      Waves.attach('.float-btn', ['waves-button', 'waves-float']);
      Waves.attach('.float-btn-light', ['waves-button', 'waves-float', 'waves-light']);
      Waves.attach('.flat-box', ['waves-block']);
      Waves.attach('.float-box', ['waves-block', 'waves-float']);
      Waves.attach('.waves-image');
      Waves.init();
    });
  </script>


  <script async src="https://cdn.jsdelivr.net/gh/xaoxuu/cdn-busuanzi@2.3/js/busuanzi.pure.mini.js"></script>




  
  
  
    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery-backstretch/2.0.4/jquery.backstretch.min.js"></script>
    <script type="text/javascript">
      $(function(){
        if ('.cover') {
          $('.cover').backstretch(
          ["https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/a0c9e6f9efad8b731cb7376504bd10d79d2053.jpg"],
          {
            duration: "6000",
            fade: "2500"
          });
        } else {
          $.backstretch(
          ["https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/a0c9e6f9efad8b731cb7376504bd10d79d2053.jpg"],
          {
            duration: "6000",
            fade: "2500"
          });
        }
      });
    </script>
  







  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/gitalk@1/dist/gitalk.css">
  <script src="https://cdn.jsdelivr.net/npm/gitalk@1/dist/gitalk.min.js"></script>
  <script type="text/javascript">
    var gitalk = new Gitalk({
      clientID: "35a5e4dc744cc7d162af",
      clientSecret: "7b5a409e17ce0c1971f284eac9f8902eb4b8feba",
      repo: "rogerspy.github.io",
      owner: "Rogerspy",
      admin: "Rogerspy",
      
        id: "/wiki/material-x/",
      
      distractionFreeMode: false  // Facebook-like distraction free mode
    });
    gitalk.render('gitalk-container');
  </script>





  <script src="https://cdn.jsdelivr.net/gh/xaoxuu/cdn-material-x@19.5/js/app.js"></script>


  <script src="https://cdn.jsdelivr.net/gh/xaoxuu/cdn-material-x@19.5/js/search.js"></script>




<!-- 复制 -->
<script src="https://cdn.jsdelivr.net/npm/clipboard@2/dist/clipboard.min.js"></script>
<script>
  let COPY_SUCCESS = "复制成功";
  let COPY_FAILURE = "复制失败";
  /*页面载入完成后，创建复制按钮*/
  !function (e, t, a) {
    /* code */
    var initCopyCode = function(){
      var copyHtml = '';
      copyHtml += '<button class="btn-copy" data-clipboard-snippet="">';
      copyHtml += '  <i class="fa fa-copy"></i><span>复制</span>';
      copyHtml += '</button>';
      $(".highlight .code pre").before(copyHtml);
      var clipboard = new ClipboardJS('.btn-copy', {
        target: function(trigger) {
          return trigger.nextElementSibling;
        }
      });

      clipboard.on('success', function(e) {
        //您可以加入成功提示
        console.info('Action:', e.action);
        console.info('Text:', e.text);
        console.info('Trigger:', e.trigger);
        success_prompt(COPY_SUCCESS);
        e.clearSelection();
      });
      clipboard.on('error', function(e) {
        //您可以加入失败提示
        console.error('Action:', e.action);
        console.error('Trigger:', e.trigger);
        fail_prompt(COPY_FAILURE);
      });
    }
    initCopyCode();

  }(window, document);

  /**
   * 弹出式提示框，默认1.5秒自动消失
   * @param message 提示信息
   * @param style 提示样式，有alert-success、alert-danger、alert-warning、alert-info
   * @param time 消失时间
   */
  var prompt = function (message, style, time)
  {
      style = (style === undefined) ? 'alert-success' : style;
      time = (time === undefined) ? 1500 : time*1000;
      $('<div>')
          .appendTo('body')
          .addClass('alert ' + style)
          .html(message)
          .show()
          .delay(time)
          .fadeOut();
  };

  // 成功提示
  var success_prompt = function(message, time)
  {
      prompt(message, 'alert-success', time);
  };

  // 失败提示
  var fail_prompt = function(message, time)
  {
      prompt(message, 'alert-danger', time);
  };

  // 提醒
  var warning_prompt = function(message, time)
  {
      prompt(message, 'alert-warning', time);
  };

  // 信息提示
  var info_prompt = function(message, time)
  {
      prompt(message, 'alert-info', time);
  };

</script>


<!-- fancybox -->
<script src="https://cdn.jsdelivr.net/gh/fancyapps/fancybox@3.5.7/dist/jquery.fancybox.min.js"></script>
<script>
  let LAZY_LOAD_IMAGE = "";
  $(".article-entry").find("fancybox").find("img").each(function () {
      var element = document.createElement("a");
      $(element).attr("data-fancybox", "gallery");
      $(element).attr("href", $(this).attr("src"));
      /* 图片采用懒加载处理时,
       * 一般图片标签内会有个属性名来存放图片的真实地址，比如 data-original,
       * 那么此处将原本的属性名src替换为对应属性名data-original,
       * 修改如下
       */
       if (LAZY_LOAD_IMAGE) {
         $(element).attr("href", $(this).attr("data-original"));
       }
      $(this).wrap(element);
  });
</script>





  <script>setLoadingBarProgress(100);</script>
</body>
</html>
